金庸小说人物嵌入可视化分析

导入库¶

In [3]:
import os
import re
import jieba
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from gensim.models import Word2Vec
import plotly.express as px
import plotly.graph_objects as go

读取小说文本¶

In [5]:
# 读取所有小说文件
novel_dir = "./Jinyong_novels"
novel_files = [f for f in os.listdir(novel_dir) if f.endswith(".txt")]

corpus = []
novel_tags = []

for file in novel_files:
    with open(os.path.join(novel_dir, file), "r", encoding="utf-8") as f:
        text = f.read()
        corpus.append(text)
        novel_tags.append(file.replace(".txt", ""))

预处理与分词¶

In [7]:
# 读取停用词表
# Stop-word list: one word per line.
with open("stopwords_cn.txt", "r", encoding="utf-8") as f:
    stopwords = {line.strip() for line in f}

def preprocess(text):
    """Tokenize `text` with jieba, keeping only tokens that are
    non-whitespace, longer than one character, and not stop words."""
    tokens = jieba.lcut(text)
    return [t for t in tokens
            if t.strip() and len(t) > 1 and t not in stopwords]

tokenized_corpus = [preprocess(text) for text in corpus]
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\XXX\AppData\Local\Temp\jieba.cache
Loading model cost 0.825 seconds.
Prefix dict has been built successfully.

训练 Word2Vec 模型¶

In [9]:
# Train a skip-gram Word2Vec model on the tokenized novels.
# `seed` fixes the RNG used for vector initialisation / sampling so runs
# are comparable; NOTE: with workers > 1 gensim training is still not
# fully deterministic (thread scheduling) — set workers=1 for exact repeats.
w2v_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,   # embedding dimensionality
    window=5,          # context window size
    min_count=5,       # ignore tokens rarer than this
    sg=1,              # 1 = skip-gram (better for infrequent words)
    workers=4,
    epochs=20,
    seed=42,           # reproducible initialisation
)

加载人物名并提取其词向量¶

In [11]:
# 加载人名词表
with open("./Jinyong_dic/人名.txt", "r", encoding="utf-8") as f:
    name_list = [line.strip() for line in f.readlines() if line.strip()]

# 提取人物名的词向量及所在小说
name_vecs = []
name_labels = []
name_sources = []

for idx, novel in enumerate(tokenized_corpus):
    novel_name = novel_tags[idx]
    for name in name_list:
        if name in novel and name in w2v_model.wv:
            name_vecs.append(w2v_model.wv[name])
            name_labels.append(name)
            name_sources.append(novel_name)

# 去重
df = pd.DataFrame({"name": name_labels, "source": name_sources})
df["vec"] = name_vecs
df = df.drop_duplicates(subset=["name"])

PCA降维¶

In [13]:
# 构建矩阵并降维
vectors = np.stack(df["vec"].to_numpy())
pca_2d = PCA(n_components=2)
pca_3d = PCA(n_components=3)

df["x_2d"], df["y_2d"] = pca_2d.fit_transform(vectors).T
df["x_3d"], df["y_3d"], df["z_3d"] = pca_3d.fit_transform(vectors).T

二维交互式可视化¶

In [15]:
# 2-D interactive scatter: one colour per novel, character name as label.
scatter_2d_kwargs = dict(
    x="x_2d",
    y="y_2d",
    color="source",
    text="name",
    title="金庸小说人物词向量的二维PCA可视化",
    width=800,
    height=600,
)
fig_2d = px.scatter(df, **scatter_2d_kwargs)
fig_2d.update_traces(textposition="top center")
fig_2d.write_html("金庸小说人物词向量的二维PCA可视化.html")
fig_2d.show()

三维交互式可视化¶

In [17]:
# 3-D interactive scatter, same encoding as the 2-D figure.
scatter_3d_kwargs = dict(
    x="x_3d",
    y="y_3d",
    z="z_3d",
    color="source",
    text="name",
    title="金庸小说人物词向量的三维PCA可视化",
    width=900,
    height=700,
)
fig_3d = px.scatter_3d(df, **scatter_3d_kwargs)
fig_3d.update_traces(marker=dict(size=5), textposition="top center")
fig_3d.write_html("金庸小说人物词向量的三维PCA可视化.html")
fig_3d.show()